This script is aimed at showing how the Principal Components Analysis (PCA) behaves for sinusoidal patterned data. The idea is that if the gene patterns have high signal to noise variation, then the PC1 vs PC2 plot would be close to a circle. The more we add noise and randomness to the sinusoidal patterns, the less obvious the circular pattern of the PCs would tend to be.
We present a non-sinusoidal gene patterns scenario now.
library(cellcycleR)
library(wavethresh)
## Loading required package: MASS
## WaveThresh: R wavelet software, release 4.6.6, installed
##
## Copyright Guy Nason and others 1993-2013
##
## Note: nlevels has been renamed to nlevelsWT
G <- 100;
num_cells <- 256;
amp_genes <- rep(10, G);
phi_genes <- rep(c(2,4,6,8), each=G/4);
sigma_genes <- rchisq(G, 0.01);
cell_times_sim <- seq(0,2*pi, length.out=num_cells);
cycle_data <- sim_sinusoidal_cycle(G, amp_genes, phi_genes, sigma_genes, cell_times_sim);
celltime_levels <- 256;
sample_reorder <- sample(1:num_cells,num_cells, replace=FALSE);
cell_times_reorder <- cell_times_sim[sample_reorder];
cycle_data_reorder <- cycle_data[sample_reorder,];
We perform PCA on the reordered data.
pca_cycle <- prcomp(cycle_data_reorder, center=TRUE, scale. = TRUE);
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
Note the PCA maps the points on the circle. We now apply sinusoidal cellcycleR on the data.
system.time(out_sinusoidal <- sin_cell_ordering_class(cycle_data_reorder, celltime_levels = 256, num_iter=100, verbose=FALSE))
## Final loglikelihood, iter 100 : 741912.131013
## user system elapsed
## 144.569 51.390 66.406
We plot the radial plots first.
library(plotrix)
library(RColorBrewer)
radial.plot(lengths=1:length(out_sinusoidal$cell_times),radial.pos=out_sinusoidal$cell_times[order(cell_times_reorder)],
line.col=colorRampPalette(brewer.pal(9,"Blues"))(length(out_sinusoidal$cell_times)), lwd=2)
We plot the patterns for two genes (for arbitrary indices)
plot(cycle_data_reorder[order(out_sinusoidal$cell_times),1], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data[,1], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data_reorder[order(out_sinusoidal$cell_times),60], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data[,60], type="l", xlab="cell times", ylab="gene expression")
Now we rank the cells based on the cell times estimated from sinusoidal cellcycleR and then plot the ranks against the ranks based on the first PC.
plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,1]), xlab="sin. cellcycleR rank", ylab="pca 1 rank", pch=20, lwd=0.5, cex=1, col="red")
plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,2]), xlab="sin. cellcycleR rank", ylab="pca 2 rank", pch=20, lwd=0.5, cex=1, col="red")
library(cellcycleR)
library(wavethresh)
G <- 100;
num_cells <- 256;
amp_genes <- rep(1, G);
phi_genes <- rep(c(2,4,6,8), each=G/4);
sigma_genes <- rchisq(G, 3);
cell_times_sim <- seq(0,2*pi, length.out=num_cells);
cycle_data <- sim_sinusoidal_cycle(G, amp_genes, phi_genes, sigma_genes, cell_times_sim);
celltime_levels <- 256;
sample_reorder <- sample(1:num_cells,num_cells, replace=FALSE);
cell_times_reorder <- cell_times_sim[sample_reorder];
cycle_data_reorder <- cycle_data[sample_reorder,];
We perform PCA on the reordered data.
pca_cycle <- prcomp(cycle_data_reorder, center=TRUE, scale. = TRUE);
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
We apply sinusoidal cellcycleR
system.time(out_sinusoidal <- sin_cell_ordering_class(cycle_data_reorder, celltime_levels = 256, num_iter=100, verbose = FALSE))
## Final loglikelihood, iter 100 : -53835.7277772
## user system elapsed
## 147.811 51.766 67.671
We plot the radial plots first.
library(plotrix)
library(RColorBrewer)
radial.plot(lengths=1:length(out_sinusoidal$cell_times),radial.pos=out_sinusoidal$cell_times[order(cell_times_reorder)],
line.col=colorRampPalette(brewer.pal(9,"Blues"))(length(out_sinusoidal$cell_times)), lwd=2)
We plot the patterns for two genes (for arbitrary indices)
plot(cycle_data_reorder[order(out_sinusoidal$cell_times),1], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data[,1], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data_reorder[order(out_sinusoidal$cell_times),60], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data[,60], type="l", xlab="cell times", ylab="gene expression")
Now we rank the cells based on the cell times estimated from sinusoidal cellcycleR and then plot the ranks against the ranks based on the first PC.
plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,1]), xlab="sin. cellcycleR rank", ylab="pca 1 rank", pch=20, lwd=0.5, cex=1, col="red")
plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,2]), xlab="sin. cellcycleR rank", ylab="pca 2 rank", pch=20, lwd=0.5, cex=1, col="red")
library(cellcycleR)
library(wavethresh)
G <- 100;
num_cells <- 256;
amp_genes <- rep(1, G);
phi_genes <- rep(c(2,4,6,8), each=G/4);
sigma_genes <- rchisq(G, 6);
cell_times_sim <- seq(0,2*pi, length.out=num_cells);
cycle_data <- sim_sinusoidal_cycle(G, amp_genes, phi_genes, sigma_genes, cell_times_sim);
celltime_levels <- 256;
sample_reorder <- sample(1:num_cells,num_cells, replace=FALSE);
cell_times_reorder <- cell_times_sim[sample_reorder];
cycle_data_reorder <- cycle_data[sample_reorder,];
We perform PCA on the reordered data.
pca_cycle <- prcomp(cycle_data_reorder, center=TRUE, scale. = TRUE);
plot(pca_cycle$x[,1], pca_cycle$x[,2], pch=20, lwd=0.01)
We apply sinusoidal cellcycleR
system.time(out_sinusoidal <- sin_cell_ordering_class(cycle_data_reorder, celltime_levels = 256, num_iter=100, verbose=FALSE))
## Final loglikelihood, iter 100 : -76919.9005907
## user system elapsed
## 149.808 54.153 72.759
We plot the radial plots first.
library(plotrix)
library(RColorBrewer)
radial.plot(lengths=1:length(out_sinusoidal$cell_times),radial.pos=out_sinusoidal$cell_times[order(cell_times_reorder)],
line.col=colorRampPalette(brewer.pal(9,"Blues"))(length(out_sinusoidal$cell_times)), lwd=2)
We plot the patterns for two genes (for arbitrary indices)
plot(cycle_data_reorder[order(out_sinusoidal$cell_times),1], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data[,1], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data_reorder[order(out_sinusoidal$cell_times),60], type="l", xlab="cell times", ylab="gene expression")
plot(cycle_data[,60], type="l", xlab="cell times", ylab="gene expression")
Now we rank the cells based on the cell times estimated from sinusoidal cellcycleR and then plot the ranks against the ranks based on the first PC.
plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,1]), xlab="sin. cellcycleR rank", ylab="pca 1 rank", pch=20, lwd=0.5, cex=1, col="red")
plot(rank(out_sinusoidal$cell_times), rank(pca_cycle$x[,2]), xlab="sin. cellcycleR rank", ylab="pca 2 rank", pch=20, lwd=0.5, cex=1, col="red")
It seems that when the SNR is high (Example 1), PC1 versus PC2 looks perfectly circular and cellcycleR does pretty well in recovering sinusoidal patterns. However the ranks from PC plots does not correspond to the ranks from the ordered cell times after we re-order the cells by the cellcycleR method. In Exmaple 2, where there is substantial noise, the circular pattern in the plot of PC1 vs PC2 is noisy, however the cellcycleR still seems to recover correct ordering (check radial plots) and the gene patterns look sinusoidal. In Example 3, when there is much higher noise compared to signal, PC plots are very noisy and circular pattern not visible at all. But although gene patterns no longer look sinusoidal, but the cellcycleR cell ordering does not seem too bad (check radial plot).